Show the code
#|
vision_count <- openalexR::oa_fetch(
title_and_abstract.search = vision_st,
count_only = TRUE,
output = "list",
verbose = TRUE
)$countData Management Report
A short description of what this is about. This is not a traditional abstract, but rather something else …
IPBES_TCA_Ch2_technology
%The BuildNo is automatically increased by one each time the report is rendered. It is used to indicate different renderings when the version stays the same%.
All searches are done on all works in OpenAlex. Searching the TCA Corpus is not possible at the moment, but we are working on it.
The search terms are based on the shared Google Doc. They are cleaned up for use in OpenAlex.
The search term is vision
#|
vision_count <- openalexR::oa_fetch(
title_and_abstract.search = vision_st,
count_only = TRUE,
output = "list",
verbose = TRUE
)$countThe search terms is technology
#|
technology_count <- openalexR::oa_fetch(
title_and_abstract.search = compact(technology_st),
count_only = TRUE,
output = "list",
verbose = TRUE
)$countThe search term is vision AND technology
#|
# Number of works matching both search terms, (vision) AND (technology), in
# title or abstract. Only the count is requested (count_only = TRUE) and it is
# taken from the `count` element of the returned list.
vision_technology_count <- openalexR::oa_fetch(
  title_and_abstract.search = compact(
    paste0("(", vision_st, ") AND (", technology_st, ")")
  ),
  count_only = TRUE,
  output = "list",
  verbose = TRUE
)$count
#|
# Works matching (vision) AND (technology), grouped by the subfield id of
# their primary topic. The query is built first, then requested, the response
# is bound into one data frame and sorted by `key`.
vision_technology_subfields <- openalexR::oa_query(
  group_by = "primary_topic.subfield.id",
  title_and_abstract.search = compact(
    paste0("(", vision_st, ") AND (", technology_st, ")")
  ),
  verbose = TRUE
) |>
  openalexR::oa_request() |>
  dplyr::bind_rows() |>
  dplyr::arrange(key)
## clean up missing or wrong vision_technology_subfields$key_display_name
# A row needs cleaning when its key_display_name is missing or parses as a
# number (a numeric value is treated as a wrong name here).
# as.numeric() is used purely as an "is this text a number?" test, so the
# "NAs introduced by coercion" warning is expected for every proper name.
# It is silenced for this single expression only, which keeps it out of the
# rendered report.
need_cleaning <- is.na(vision_technology_subfields$key_display_name) |
  !is.na(
    suppressWarnings(
      as.numeric(vision_technology_subfields$key_display_name)
    )
  )
# Rows whose display name can be used as is.
fine <- !need_cleaning
vision_technology_subfields <- vision_technology_subfields |>
dplyr::filter(fine) |>
dplyr::select(key, key_display_name) |>
dplyr::distinct() |>
merge(y = vision_technology_subfields[need_cleaning, -2], by = "key") |>
dplyr::bind_rows(vision_technology_subfields[fine, ]) |>
dplyr::group_by(key, key_display_name) |>
dplyr::summarize(count = sum(count))technology AND vision CorpusThe corpus download will be stored in data/pages and the arrow database in data/corpus.
This is not on github!
The corpus can be read by running get_corpus(), which opens the database so that it can then be fed into a dplyr pipeline. After most dplyr functions, the actual data needs to be collected via collect().
Only then is the actual data read!
Needs to be enabled by setting eval: true in the code block below.
#|
# Download the (vision) AND (technology) corpus, one directory per publication
# year below data/pages. Years that already have a directory without a
# next_page.rds file are treated as completed and skipped, so the chunk can be
# re-run.
tic()

pages_dir <- file.path(".", "data", "pages")
dir.create(pages_dir, showWarnings = FALSE, recursive = TRUE)

# Publication years in the search result (the `key` of the grouping).
years <- oa_fetch(
  group_by = "publication_year",
  title_and_abstract.search = compact(
    paste0("(", vision_st, ") AND (", technology_st, ")")
  ),
  paging = "cursor",
  verbose = FALSE
)$key

#######
#######
# Years for which a download directory exists already.
processed <- sub(
  "^pages_publication_year=",
  "",
  list.dirs(pages_dir, full.names = FALSE, recursive = FALSE)
)

# Years whose directory still contains a next_page.rds file. The variable
# name suggests these downloads did not finish -- TODO confirm against
# IPBES.R::oa_request_IPBES().
interrupted <- sub(
  "/next_page.rds$",
  "",
  sub(
    paste0("^", pages_dir, "/pages_publication_year="),
    "",
    list.files(
      pages_dir,
      pattern = "^next_page.rds",
      full.names = TRUE,
      recursive = TRUE
    )
  )
)

# Only years that are neither missing nor interrupted count as completed.
completed <- processed[!processed %in% interrupted]
years <- years[!years %in% completed]
#######
#######

# Request the remaining years in random order; the year specific directory is
# handed to oa_request_IPBES() as output_path.
pbmcapply::pbmclapply(
  sample(years),
  function(yr) {
    message("\nGetting data for year ", yr, " ...")
    output_path <- file.path(pages_dir, paste0("pages_publication_year=", yr))
    dir.create(output_path, showWarnings = FALSE, recursive = TRUE)
    IPBES.R::oa_request_IPBES(
      oa_query(
        title_and_abstract.search = compact(
          paste0("(", vision_st, ") AND (", technology_st, ")")
        ),
        publication_year = yr,
        options = list(
          select = c(
            "id", "doi", "authorships", "publication_year",
            "display_name", "abstract_inverted_index", "topics"
          )
        ),
        verbose = FALSE
      ),
      count_only = FALSE,
      output_path = output_path,
      verbose = TRUE
    )
  },
  mc.preschedule = FALSE,
  mc.cores = 1
)
toc()The fields author and topics are serialized in the arrow database and need to be unserialized by using unserialize_arrow() on a dataset containing the two columns.
# Convert the downloaded pages (data/pages) into a parquet dataset in
# data/corpus, partitioned by publication_year and page. Years that already
# have a publication_year=<year> directory in data/corpus are skipped.
tic()

pages_dir <- file.path(".", "data", "pages")
arrow_dir <- file.path(".", "data", "corpus")

# Full paths of the per-year directories on both sides.
years <- list.dirs(pages_dir, full.names = TRUE, recursive = FALSE)
years_done <- list.dirs(arrow_dir, full.names = TRUE, recursive = FALSE)

# Keep only the page directories whose year has no directory in arrow_dir;
# both sides are reduced to the bare year before comparing.
years <- years[
  !sub(paste0("^", pages_dir, "/pages_publication_year="), "", years) %in%
    sub(paste0("^", arrow_dir, "/publication_year="), "", years_done)
]

pbapply::pblapply(
  years,
  function(year_dir) {
    message("\n Processing year ", year_dir, " ...\n")
    page_files <- list.files(
      year_dir,
      pattern = "^page_",
      full.names = TRUE,
      recursive = TRUE
    )
    invisible(
      pbmcapply::pbmclapply(
        page_files,
        function(page_file) {
          # One page file -> one data frame of works.
          works <- openalexR::works2df(
            readRDS(page_file)$results,
            verbose = FALSE
          )
          works$author_abbr <- IPBES.R::abbreviate_authors(works)
          works <- serialize_arrow(works)
          # Page identifier: file name without "page_" prefix and rds suffix.
          works$page <- sub(
            ".rds$",
            "",
            sub("^page_", "", basename(page_file))
          )
          arrow::write_dataset(
            works,
            path = arrow_dir,
            partitioning = c("publication_year", "page"),
            format = "parquet",
            existing_data_behavior = "overwrite"
          )
        },
        mc.cores = 6 # params$mc.cores
      )
    )
  }
)
toc()ids_technology <- read_corpus(file.path("data", "corpus")) |>
dplyr::select(id) |>
collect() |>
unlist()Warning: Invalid metadata$r
ids_tca <- read_corpus(file.path("..", "IPBES_TCA_Corpus", "data", "corpus")) |>
dplyr::select(id) |>
collect() |>
unlist()Warning: Invalid metadata$r
# Ids of the technology corpus that also occur in the TCA corpus.
ids_subs_tca <- ids_technology[ids_technology %in% ids_tca]

# Source (data/corpus) and target (data/corpus_tca) datasets.
arrow_dir <- file.path(".", "data", "corpus")
arrow_tca_dir <- file.path(".", "data", "corpus_tca")

year_dirs <- list.dirs(arrow_dir, full.names = TRUE, recursive = FALSE)
year_done <- list.dirs(arrow_tca_dir, full.names = TRUE, recursive = FALSE)

# Drop the year directories that already exist in the target dataset.
year_dirs <- year_dirs[!basename(year_dirs) %in% basename(year_done)]

# Bare year of every remaining directory, plus the index to iterate over.
years <- gsub("publication_year=", "", basename(year_dirs))
ys <- seq_along(year_dirs)
pbapply::pblapply(
ys,
function(y) {
data <- read_corpus(year_dirs[[y]]) |>
dplyr::collect() |>
dplyr::filter(id %in% ids_subs_tca)
if (nrow(data) > 0) {
data |>
dplyr::mutate(publication_year = as.integer(years[[y]])) |>
arrow::write_dataset(
path = arrow_tca_dir,
partitioning = "publication_year",
format = "parquet",
existing_data_behavior = "overwrite"
)
}
}
)[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
[[5]]
NULL
[[6]]
NULL
[[7]]
NULL
[[8]]
NULL
[[9]]
NULL
[[10]]
NULL
[[11]]
NULL
[[12]]
NULL
[[13]]
NULL
[[14]]
NULL
[[15]]
NULL
[[16]]
NULL
[[17]]
NULL
[[18]]
NULL
[[19]]
NULL
[[20]]
NULL
[[21]]
NULL
[[22]]
NULL
[[23]]
NULL
[[24]]
NULL
[[25]]
NULL
[[26]]
NULL
[[27]]
NULL
[[28]]
NULL
[[29]]
NULL
[[30]]
NULL
[[31]]
NULL
[[32]]
NULL
[[33]]
NULL
[[34]]
NULL
[[35]]
NULL
[[36]]
NULL
[[37]]
NULL
[[38]]
NULL
[[39]]
NULL
[[40]]
NULL
[[41]]
NULL
[[42]]
NULL
[[43]]
NULL
[[44]]
NULL
[[45]]
NULL
[[46]]
NULL
[[47]]
NULL
[[48]]
NULL
[[49]]
NULL
[[50]]
NULL
[[51]]
NULL
[[52]]
NULL
[[53]]
NULL
[[54]]
NULL
[[55]]
NULL
[[56]]
NULL
[[57]]
NULL
[[58]]
NULL
[[59]]
NULL
[[60]]
NULL
[[61]]
NULL
[[62]]
NULL
[[63]]
NULL
[[64]]
NULL
[[65]]
NULL
[[66]]
NULL
[[67]]
NULL
[[68]]
NULL
[[69]]
NULL
[[70]]
NULL
[[71]]
NULL
[[72]]
NULL
[[73]]
NULL
[[74]]
NULL
[[75]]
NULL
[[76]]
NULL
[[77]]
NULL
[[78]]
NULL
[[79]]
NULL
[[80]]
NULL
[[81]]
NULL
[[82]]
NULL
[[83]]
NULL
[[84]]
NULL
[[85]]
NULL
[[86]]
NULL
[[87]]
NULL
[[88]]
NULL
[[89]]
NULL
[[90]]
NULL
[[91]]
NULL
[[92]]
NULL
[[93]]
NULL
[[94]]
NULL
[[95]]
NULL
[[96]]
NULL
[[97]]
NULL
[[98]]
NULL
[[99]]
NULL
[[100]]
NULL
[[101]]
NULL
[[102]]
NULL
[[103]]
NULL
[[104]]
NULL
[[105]]
NULL
[[106]]
NULL
[[107]]
NULL
[[108]]
NULL
[[109]]
NULL
[[110]]
NULL
[[111]]
NULL
[[112]]
NULL
[[113]]
NULL
[[114]]
NULL
[[115]]
NULL
toc()visionHits for search term vision: 105,605,001 hits
Individual terms combined by OR:
#|
assess_search_term(readLines(file.path("input", "vision.txt"))) |>
dplyr::arrange(desc(count)) |>
dplyr::mutate(count = formatC(count, format = "f", big.mark = ",", digits = 0)) |>
knitr::kable()| term | count |
|---|---|
| model | 20,927,440 |
| process | 16,682,151 |
| effect | 13,632,994 |
| approach | 12,696,246 |
| value | 11,127,761 |
| activity | 9,857,218 |
| performance | 9,812,586 |
| technique | 9,165,469 |
| influence | 8,608,597 |
| response | 8,436,545 |
| relationships | 6,971,303 |
| objective | 6,739,843 |
| solution | 6,642,628 |
| strategy | 6,119,457 |
| image | 6,088,314 |
| view | 5,448,214 |
| future | 5,370,203 |
| target | 5,231,536 |
| reaction | 4,779,218 |
| knowledge | 4,741,397 |
| project | 4,299,961 |
| project | 4,299,961 |
| policy | 4,235,675 |
| action | 3,918,602 |
| plan | 3,912,838 |
| operation | 3,773,495 |
| culture | 3,633,033 |
| perspective | 3,389,608 |
| task | 2,801,091 |
| effort | 2,635,904 |
| government | 2,492,824 |
| idea | 2,255,473 |
| opportunity | 2,211,126 |
| transmission | 2,168,545 |
| respect | 2,071,852 |
| perception | 1,715,098 |
| platform | 1,640,040 |
| existence | 1,580,823 |
| movement | 1,514,491 |
| scenarios | 1,469,098 |
| innovation | 1,230,430 |
| desire | 1,137,800 |
| vision | 1,111,827 |
| visioning | 1,111,827 |
| reality | 977,987 |
| story | 950,186 |
| conceptual | 911,789 |
| motivation | 867,911 |
| appearance | 848,699 |
| responsibilities | 843,344 |
| visualization | 823,644 |
| initiative | 793,884 |
| moment | 787,457 |
| hope | 754,107 |
| discourse | 719,019 |
| iteration | 706,046 |
| cooperation | 682,796 |
| mission | 662,205 |
| territory | 460,720 |
| intention | 442,892 |
| agenda | 409,691 |
| wish | 404,894 |
| dialogue | 378,949 |
| consultation | 324,349 |
| aspiration | 300,008 |
| fiction | 293,234 |
| spiritual | 273,170 |
| co-production | 242,175 |
| imagery | 240,930 |
| creativity | 239,666 |
| universe | 230,428 |
| dream | 199,293 |
| sight | 191,279 |
| imagination | 169,465 |
| inspiration | 164,273 |
| cosmology | 159,668 |
| harmony | 120,342 |
| coalition | 110,398 |
| self-determination | 98,243 |
| solidarity | 92,470 |
| fantasy | 75,665 |
| roadmap | 67,724 |
| worldview | 66,307 |
| reciprocity | 59,863 |
| ceremony | 59,657 |
| “collective action” | 40,651 |
| visionary | 28,403 |
| programm | 26,415 |
| foresight | 23,229 |
| “participatory process” | 6,498 |
| cosmovision | 2,896 |
| “deliberate process” | 607 |
| communit | 449 |
| cosmocentric | 98 |
| languague | 83 |
| arquetype | 5 |
technologyHits for search term technology: 14,937,469 hits
Individual terms combined by OR:
#|
assess_search_term(readLines(file.path("input", "technology.txt"))) |>
dplyr::arrange(desc(count)) |>
dplyr::mutate(count = formatC(count, format = "f", big.mark = ",", digits = 0)) |>
knitr::kable()| term | count |
|---|---|
| Technology | 6,358,098 |
| Software | 2,467,986 |
| Machine-to-Machine | 2,131,976 |
| Internet | 1,245,794 |
| “Social Media” | 1,059,530 |
| Virtualization | 1,033,802 |
| Robotics | 774,867 |
| “Machine Learning” | 675,928 |
| “Deep Learning” | 394,634 |
| “Artificial Intelligence” | 325,704 |
| “Renewable Energy” | 255,935 |
| “Biotechnology” | 235,095 |
| IOT | 213,696 |
| “Big Data” | 191,660 |
| “Internet of Things” | 180,973 |
| “Computer Vision” | 129,572 |
| “Virtual Reality” | 124,486 |
| “Cloud Computing” | 121,643 |
| E-commerce | 118,827 |
| Nanotechnology | 100,913 |
| 5G | 95,726 |
| Blockchain | 89,136 |
| “Natural Language Processing” | 79,992 |
| “Augmented Reality” | 67,666 |
| “Speech Recognition” | 66,188 |
| “3D Printing” | 65,530 |
| “Smart Grid” | 56,562 |
| “Genetic Engineering” | 41,896 |
| “Genetic engineering” | 41,896 |
| “Autonomous Vehicle” | 41,368 |
| “Digital Transformation” | 40,298 |
| “Circular Economy” | 37,977 |
| Cybersecurity | 35,691 |
| “Clean Energy” | 33,823 |
| “Blockchain Technology” | 32,238 |
| “Data Science” | 30,990 |
| “Edge Computing” | 30,247 |
| “Cyber-Physical Systems” | 27,882 |
| “Smart Home” | 26,349 |
| “Quantum Computing” | 25,652 |
| “Digital Twin” | 23,059 |
| Cryptocurrency | 22,193 |
| “Space Technology” | 17,503 |
| Fintech | 14,852 |
| “Application Programming Interface” | 13,424 |
| “Mixed Reality” | 11,753 |
| “Facial Recognition” | 9,962 |
| “Wearable Technology” | 8,145 |
| Microservices | 7,575 |
| “Sustainable Technology” | 7,070 |
| “Digital Currency” | 5,743 |
| “Agile Development” | 4,866 |
| “Computational Technology” | 3,668 |
| DevOps | 3,491 |
| “Digital Wallet” | 966 |
| “Internet Safety” | 923 |
| “Internet Privacy” | 742 |
| “Digital Ethics” | 479 |
vision AND technologyHits for search term vision: 12,003,981 hits
The subfields are based on the main topic assigned to each work. Other topics are also assigned, but this one has been identified as the main topic by an algorithm. count is the number of works in the vision AND technology corpus which have been assigned to the subfield.
Please take a look at these subfields of the topics to identify the ones to be filtered out.
The easiest way would be to download the Excel file through the button and to mark the subfields to be filtered out.
IPBES.R::table_dt(vision_technology_subfields, fixedColumns = NULL, fn = "Vision Technology Subfields")@report{krug,
author = {Krug, Rainer M.},
title = {Report {Assessment} {Ch2} {Technology} {Visions}},
doi = {XXXXXX},
langid = {en},
abstract = {A short description of what this is about. This is not a
traditional abstract, but rather something else ...}
}